{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:05:22.581880Z",
     "iopub.status.busy": "2023-09-15T16:05:22.581538Z",
     "iopub.status.idle": "2023-09-15T16:05:25.592596Z",
     "shell.execute_reply": "2023-09-15T16:05:25.591501Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import pickle\n",
    "from collections import Counter\n",
    "import polars as pl\n",
    "from statistics import mode\n",
    "\n",
    "# CHANGE\n",
    "PATH_OUTPUT = '' # curated data\n",
    "\n",
    "sys.path.append('../')\n",
    "import tools\n",
    "\n",
    "pl.Config.set_fmt_str_lengths(100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:05:25.597059Z",
     "iopub.status.busy": "2023-09-15T16:05:25.596460Z",
     "iopub.status.idle": "2023-09-15T16:05:30.071967Z",
     "shell.execute_reply": "2023-09-15T16:05:30.070984Z"
    }
   },
   "outputs": [],
   "source": [
    "# add manual matches\n",
    "\n",
    "def add_manual_matches(location, df, df_mapping_company_name_to_naics):\n",
    "    \"\"\"add manual matches from df_mapping_company_name_to_naics\"\"\"\n",
    "    return (\n",
    "        df\n",
    "        # adds column 'location with 'city', 'province', or 'national'\n",
    "        .with_columns(pl.lit(location).alias('location'))\n",
    "        .join(df_mapping_company_name_to_naics, on='cleaned_name', how='left', suffix='_manual')\n",
    "        .with_columns(\n",
    "            [\n",
    "                (\n",
    "                    pl.when(pl.col('manual'))  # score=1 if naics manually imputed\n",
    "                    .then(1)\n",
    "                    .otherwise(pl.col('score'))\n",
    "                    .alias('score')\n",
    "                ),\n",
    "                (\n",
    "                    pl.when(pl.col('manual'))  # \n",
    "                    .then(pl.col('naics_manual'))\n",
    "                    .otherwise(pl.col('naics'))\n",
    "                    .alias('naics')\n",
    "                ),\n",
    "                (\n",
    "                    pl.when(pl.col('manual'))\n",
    "                    .then('manual')\n",
    "                    .otherwise(pl.col('match'))\n",
    "                    .alias('match')\n",
    "                )      \n",
    "            ]\n",
    "        )\n",
    "        .drop(['manual', 'naics_manual'])\n",
    "    )\n",
    "\n",
    "# load manual matches and put them in a dataframe\n",
    "\n",
    "df_mapping_cleaned_name_to_naics = pl.read_parquet(\n",
    "    PATH_OUTPUT + 'df_mapping_cleaned_name_to_naics.parquet'\n",
    "    )\n",
    "\n",
    "df_matches_companies = {}\n",
    "for location in ['city', 'province', 'national']:\n",
    "    path = PATH_OUTPUT + f\"df_{location}.parquet\"\n",
    "    (\n",
    "        add_manual_matches(location, pl.read_parquet(path), df_mapping_cleaned_name_to_naics)\n",
    "        .write_parquet(path)\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:05:30.076200Z",
     "iopub.status.busy": "2023-09-15T16:05:30.075887Z",
     "iopub.status.idle": "2023-09-15T16:07:41.439978Z",
     "shell.execute_reply": "2023-09-15T16:07:41.438312Z"
    }
   },
   "outputs": [],
   "source": [
    "# define function to select matches according to cutoffs \n",
    "\n",
    "def join_matches_location(df_unmatched, df_matches, cutoff, additional_cols):\n",
    "    \"\"\"join matches for location\"\"\"\n",
    "    df_matched = (\n",
    "        df_unmatched\n",
    "        .join(\n",
    "            df_matches\n",
    "            .select(\n",
    "                ['cleaned_name', 'match', 'score', 'naics'] + additional_cols\n",
    "            )\n",
    "            .filter(pl.col('score') >= cutoff),\n",
    "            on=['cleaned_name'] + additional_cols,\n",
    "            how='inner'\n",
    "        )\n",
    "        .with_columns(pl.lit(location).alias('match_level')\n",
    "        )\n",
    "    )\n",
    "    df_unmatched = df_unmatched.join(df_matched, on='job_key', how='anti')\n",
    "    print(location)\n",
    "    return df_matched, df_unmatched\n",
    "\n",
    "\n",
    "# mapper to clean company names\n",
    "preprocess_mapper = pl.read_parquet(PATH_OUTPUT + \"preprocess_mapper.parquet\")\n",
    "\n",
    "# initial data without naics codes\n",
    "df_unmatched_jobs = (\n",
    "    pl.read_parquet(\n",
    "        PATH_OUTPUT + 'indeed_all_jobs.parquet',\n",
    "        columns=[\n",
    "            'job_key', 'job_title', 'company_name', 'city', 'province',\n",
    "            'date_first_visible', 'last_date', 'searchable_days'\n",
    "            ]\n",
    "    )\n",
    "    .join(\n",
    "        pl.read_parquet(PATH_OUTPUT + 'preprocess_mapper.parquet'), \n",
    "        on='company_name', how='left')\n",
    ")\n",
    "\n",
    "cutoffs = {\n",
    "    'city': 0.7,\n",
    "    'province': 0.7,\n",
    "    'national': 0.7\n",
    "}    \n",
    "\n",
    "additional_cols = {\n",
    "    'city': ['city', 'province'],\n",
    "    'province': ['province'],\n",
    "    'national': []\n",
    "}\n",
    "\n",
    "dfs_jobs_matched = {}\n",
    "for location in ['city', 'province', 'national']:\n",
    "    df_matches = pl.read_parquet(PATH_OUTPUT + f\"df_{location}.parquet\")\n",
    "    dfs_jobs_matched[location], df_unmatched_jobs = join_matches_location(\n",
    "        df_unmatched_jobs, \n",
    "        df_matches, \n",
    "        cutoffs[location], \n",
    "        additional_cols[location]\n",
    "    )\n",
    "df_all_jobs_matched = (\n",
    "    pl.concat(dfs_jobs_matched.values(), how='vertical')\n",
    ")\n",
    "\n",
    "df_all_jobs_matched = (\n",
    "    df_all_jobs_matched\n",
    "    .with_columns(\n",
    "        pl.Series([mode(l_naics) for l_naics in df_all_jobs_matched['naics'].to_list()])  # much faster than .apply()\n",
    "        .alias('modal_naics')\n",
    "    )\n",
    ") \n",
    "\n",
    "df_all_jobs_matched.write_parquet(PATH_OUTPUT + 'df_matches_all_naics.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T16:07:41.719802Z",
     "iopub.status.busy": "2023-09-15T16:07:41.719508Z",
     "iopub.status.idle": "2023-09-15T16:07:41.726115Z",
     "shell.execute_reply": "2023-09-15T16:07:41.725290Z"
    }
   },
   "outputs": [],
   "source": [
    "# count percentage of jobs matched\n",
    "count = {}\n",
    "for location in ['city', 'province', 'national']:\n",
    "    count[location] = dfs_jobs_matched[location].shape[0]\n",
    "count['unmatched'] = df_unmatched_jobs.shape[0]\n",
    "\n",
    "total = sum(count.values())\n",
    "\n",
    "for k in count:\n",
    "    print(f'percentage {k}: {count[k] / total:0.2f}')\n",
    "print(f'total: {total}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df_all_jobs_matched['date_first_visible'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_jobs_matched"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.12 ('env_indeed2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "7120590dfa35e6512fb14e5e70b67446c3e78c7a5c027e908dbb14d6a3f8a0eb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
